R Markdown
setwd("/Users/macbook/Desktop/Bio-Research\ /PS050_cellranger_count_outs/raw_feature_bc_matrix")
EpCAM.data <- Read10X(data.dir = "/Users/macbook/Desktop/Bio-Research\ /PS050_cellranger_count_outs/filtered_feature_bc_matrix/")
# Create Seurat object
EpCAM <- CreateSeuratObject(counts = EpCAM.data, project = "mSG_EpCAM", save.SNN = TRUE)
EpCAM
## An object of class Seurat
## 32285 features across 8277 samples within 1 assay
## Active assay: RNA (32285 features, 0 variable features)
# Add number of genes per UMI for each cell to metadata
# EpCAM$log10GenesPerUMI <- log10(EpCAM$nFeature_RNA) - log10(EpCAM$nCount_RNA) ?
EpCAM$log10GenesPerUMI <- log10(EpCAM$nFeature_RNA) / log10(EpCAM$nCount_RNA)
# Compute percent mito ratio
# What is the meaning of `pattern = "^mt-" ` here?
EpCAM$mitoRatio <- PercentageFeatureSet(object = EpCAM, pattern = "^mt-")
EpCAM$mitoRatio <- EpCAM@meta.data$mitoRatio / 100
# Create metadata dataframe
metadata <- EpCAM@meta.data
# Add cell IDs to metadata
metadata$cells <- rownames(metadata)
# Rename columns
metadata <- metadata %>%
dplyr::rename(seq_folder = orig.ident,
nUMI = nCount_RNA,
nGene = nFeature_RNA)
# Create sample column
metadata$sample <- "mSG"
# Create metadata dataframe
metadata <- EpCAM@meta.data
# Add cell IDs to metadata
metadata$cells <- rownames(metadata)
# Rename columns
metadata <- metadata %>%
dplyr::rename(seq_folder = orig.ident,
nUMI = nCount_RNA,
nGene = nFeature_RNA)
# Create sample column
metadata$sample <- "mSG"
# Add metadata back to Seurat object
EpCAM@meta.data <- metadata
# Cell-level FILTERING
set.seed (12)
filtered_seurat <- subset(x = EpCAM,
subset= (nUMI >= 1) &
(nUMI <=20000) &
(nGene >= 100) &
(log10GenesPerUMI > 0.5) &
(log10GenesPerUMI < 0.9) &
(mitoRatio < 0.20))
# Output a logical vector for every gene on whether the more than zero counts per cell
# Extract counts
counts <- GetAssayData(object = filtered_seurat, slot = "counts")
# Save filtered subset to new metadata
metadata_clean <- filtered_seurat@meta.data
filtered_seurat[["percent.mt"]] <- PercentageFeatureSet(filtered_seurat, pattern = "^mt-")
plot1 <- FeatureScatter(filtered_seurat, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(filtered_seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2+ plot_layout(ncol = 1)

# Normalize the counts
set.seed(12)
filtered_seurat_norm <- SCTransform(filtered_seurat, vars.to.regress = "mitoRatio")
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 14745 by 2806
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 2806 cells
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
## Found 108 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 14745 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========= | 13%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|=================== | 27%
|
|===================== | 30%
|
|======================= | 33%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================= | 87%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Computing corrected count matrix for 14745 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========= | 13%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|=================== | 27%
|
|===================== | 30%
|
|======================= | 33%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================= | 87%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 37.43277 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out mitoRatio
## Centering data matrix
## Set default assay to SCT
# Check which assays are stored in objects
filtered_seurat_norm@assays
## $RNA
## Assay data with 32285 features for 2806 cells
## First 10 features:
## Xkr4, Gm1992, Gm19938, Gm37381, Rp1, Sox17, Gm37587, Gm37323, Mrpl15,
## Lypla1
##
## $SCT
## SCTAssay data with 14745 features for 2806 cells, and 1 SCTModel(s)
## Top 10 variable features:
## Klk1, Wfdc18, Smgc, Klk1b26, Pip, Scgb1b27, Scgb2b27, Car6, Klk1b11,
## Mucl2
# Identification of highly variable features (feature selection)
filtered_seurat_norm <- FindVariableFeatures(filtered_seurat_norm, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(filtered_seurat_norm), 10)
par(plt = c(0.1, 0.9, 0.1, 0.9))
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(filtered_seurat_norm)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2+ plot_layout(ncol = 1)

# Run PCA
set.seed (12)
filtered_seurat_norm_PCA <- RunPCA(object = filtered_seurat_norm)
## PC_ 1
## Positive: S100a6, Fxyd2, Nupr1, Actg1, Mt1, Anxa1, Crisp3, Cldn4, Stap1, Ubc
## Ifi27l2a, Actb, Krt18, Tnfrsf12a, Ifrd1, Atf3, Krt8, Ifitm3, Mt2, Ascl3
## Maff, Ubb, S100a10, Ier3, Gadd45a, Ier5, Cebpd, Ly6d, Krt7, Ldhb
## Negative: Pip, Wfdc12, Smgc, A630073D07Rik, Scgb2b27, Scgb1b27, Aqp5, Mucl2, Prol1, Car6
## Scgb2b26, Cldn10, Prlr, Phyh, Lars2, Lman1l, Gm44410, St3gal4, Serpinb11, Tll1
## Gm26917, Agt, Col11a1, Lrrc26, Fxyd3, Gdpd1, Hp, Rexo2, H2afj, Fam25c
## PC_ 2
## Positive: Klk1, Klk1b11, Klk1b26, Klk1b5, Klk1b4, Klk1b9, Bglap3, Klk1b3, Klk1b16, Ly6a
## Lgals3, Fosb, Scgb1c1, Crisp3, Klk1b22, Cyp2f2, Klk1b21, Timp3, Ces1d, Pcp4l1
## Klk1b27, Krt8, Foxq1, Arg1, Serpinb1a, Cldn4, Ngf, Egf, Fos, G0s2
## Negative: Stap1, S100a6, Ifi27l2a, Slc12a2, Ascl3, Hepacam2, Nupr1, Slc4a11, Ifitm3, Aldh1a3
## Pam, Foxi1, Serpinb9, Itih5, Pip, Rcan2, Ncald, Asgr1, Tspan1, Smgc
## Ldhb, Cebpd, Cpxm2, Vps36, Itpr2, Nrip3, Pdlim3, Aldh1a1, Rbpms, A630073D07Rik
## PC_ 3
## Positive: Wfdc18, Esp4, Car2, Tmem176b, Tmem176a, Ccnd1, Gjb2, Cyp2f2, Ctsh, Mgst1
## Hp, Aox3, BC006965, Ano1, Esp18, Elf5, Pir, Barx2, Slc13a2, Ehf
## Fth1, Aldoc, Palmd, Ces1d, Cdo1, Cyba, Cebpb, Taldo1, Lmo4, Nkd2
## Negative: Klk1, Crisp3, Klk1b26, Fxyd2, Klk1b11, Klk1b5, Klk1b9, Klk1b4, Smgc, Pip
## Mt1, Klk1b3, A630073D07Rik, Phyh, Klk1b22, Klk1b16, Aqp5, Mt2, Scgb1c1, Wfdc12
## Klk1b21, Cldn10, Klk1b27, Stap1, S100a6, Ngf, Wfdc2, Serpinb1a, Lman1l, Gm44410
## PC_ 4
## Positive: Crisp3, Fxyd2, Wfdc18, Ifi27l2a, Bglap3, Nupr1, Car2, Krt18, Anxa1, Stap1
## Esp4, Cyp2f2, Krt8, Ascl3, Slc12a2, Klk1b11, Klk1b26, Ldhb, Cldn4, Hepacam2
## Cryab, Gjb2, Ckmt1, Serpinb6b, Cited4, Cxcl17, Ncald, Actg1, Tmsb4x, Serpinb9
## Negative: Dcn, Sparc, Smoc2, Apoe, Rbp1, Igfbp4, Mgp, Col4a1, Vim, Serpinh1
## Lgals1, Htra1, Igfbp7, Cd34, Ebf1, Mdk, Rnase4, Hspg2, Cxcl12, Emp3
## Ppic, Aebp1, Plpp3, Efemp1, Lrp1, AW112010, C1s1, Lamb1, Fbln2, Meg3
## PC_ 5
## Positive: Actg1, Ifrd1, Krt8, Arid5b, Krt18, Actb, Tnfrsf12a, Cldn4, Pip, S100a10
## Ly6d, Sfn, Smgc, Tuba1c, Maff, Fosl1, Lmna, Fhl2, H3f3b, Tacstd2
## Ndrg1, Mast4, Cdkn1a, A630073D07Rik, Cldn10, Sprr1a, Ptn, Hspb8, Wfdc12, Phyh
## Negative: Crisp3, Ifi27l2a, Klk1b5, Fxyd2, Klk1b4, G0s2, Klk1b26, Klk1, Klk1b9, Klk1b22
## Fos, Klk1b16, Klk1b3, Stap1, Ascl3, Scgb1c1, Esp4, Klk1b21, Bglap3, Klk1b11
## Itih5, Tmem176b, Tmem176a, Klk1b27, Ngf, Cd81, Car2, Ldhb, Hepacam2, Hspa1a
# Plot PCA
PCAPlot(filtered_seurat_norm_PCA)

# Run UMAP
set.seed(12)
filtered_seurat_norm_UMAP <- RunUMAP(filtered_seurat_norm_PCA,
dims = 1:15,
reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 16:23:16 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:23:16 Read 2806 rows and found 15 numeric columns
## 16:23:16 Using Annoy for neighbor search, n_neighbors = 30
## 16:23:16 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:23:17 Writing NN index file to temp file /var/folders/v2/mpyxd00n14zdpng47ksgc4z80000gn/T//RtmpNUNMgi/file1408a1f654025
## 16:23:17 Searching Annoy index using 1 thread, search_k = 3000
## 16:23:17 Annoy recall = 100%
## 16:23:18 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 16:23:19 Initializing from normalized Laplacian + noise (using irlba)
## 16:23:19 Commencing optimization for 500 epochs, with 112676 positive edges
## 16:23:23 Optimization finished
# Plot UMAP
DimPlot(filtered_seurat_norm_UMAP)

# Run t-SNE
filtered_seurat_norm_tSNE <- RunTSNE(
filtered_seurat_norm_PCA, reduction = "pca",
assay = NULL,
seed.use = 12,
tsne.method = "Rtsne",
dim.embed = 2,
dims = 1:20,
reduction.key = "tSNE_")
# Plot t-SNE
TSNEPlot(filtered_seurat_norm_tSNE)

# CLUSTERING CELLS BASED ON TOP PCs (METAGENES)
# Identify significant PCs
DimHeatmap(filtered_seurat_norm_tSNE,
dims = 1:12,
cells = 500,
balanced = TRUE)

# Printing out the most variable genes driving PCs
print(x = filtered_seurat_norm_tSNE[["pca"]],
dims = 1:12,
nfeatures = 15)
## PC_ 1
## Positive: S100a6, Fxyd2, Nupr1, Actg1, Mt1, Anxa1, Crisp3, Cldn4, Stap1, Ubc
## Ifi27l2a, Actb, Krt18, Tnfrsf12a, Ifrd1
## Negative: Pip, Wfdc12, Smgc, A630073D07Rik, Scgb2b27, Scgb1b27, Aqp5, Mucl2, Prol1, Car6
## Scgb2b26, Cldn10, Prlr, Phyh, Lars2
## PC_ 2
## Positive: Klk1, Klk1b11, Klk1b26, Klk1b5, Klk1b4, Klk1b9, Bglap3, Klk1b3, Klk1b16, Ly6a
## Lgals3, Fosb, Scgb1c1, Crisp3, Klk1b22
## Negative: Stap1, S100a6, Ifi27l2a, Slc12a2, Ascl3, Hepacam2, Nupr1, Slc4a11, Ifitm3, Aldh1a3
## Pam, Foxi1, Serpinb9, Itih5, Pip
## PC_ 3
## Positive: Wfdc18, Esp4, Car2, Tmem176b, Tmem176a, Ccnd1, Gjb2, Cyp2f2, Ctsh, Mgst1
## Hp, Aox3, BC006965, Ano1, Esp18
## Negative: Klk1, Crisp3, Klk1b26, Fxyd2, Klk1b11, Klk1b5, Klk1b9, Klk1b4, Smgc, Pip
## Mt1, Klk1b3, A630073D07Rik, Phyh, Klk1b22
## PC_ 4
## Positive: Crisp3, Fxyd2, Wfdc18, Ifi27l2a, Bglap3, Nupr1, Car2, Krt18, Anxa1, Stap1
## Esp4, Cyp2f2, Krt8, Ascl3, Slc12a2
## Negative: Dcn, Sparc, Smoc2, Apoe, Rbp1, Igfbp4, Mgp, Col4a1, Vim, Serpinh1
## Lgals1, Htra1, Igfbp7, Cd34, Ebf1
## PC_ 5
## Positive: Actg1, Ifrd1, Krt8, Arid5b, Krt18, Actb, Tnfrsf12a, Cldn4, Pip, S100a10
## Ly6d, Sfn, Smgc, Tuba1c, Maff
## Negative: Crisp3, Ifi27l2a, Klk1b5, Fxyd2, Klk1b4, G0s2, Klk1b26, Klk1, Klk1b9, Klk1b22
## Fos, Klk1b16, Klk1b3, Stap1, Ascl3
## PC_ 6
## Positive: Fos, Zfp36, Atf3, Junb, Smgc, Jun, Btg2, Egr1, Ier2, Irf1
## Gadd45g, H3f3b, Fosb, Ier3, Gadd45b
## Negative: Mucl2, Prol1, Scgb1b27, Car6, Scgb2b26, Scgb2b27, S100a6, Ly6d, S100a10, Ptn
## Agt, Dcn, Pigr, Ndrg1, F3
## PC_ 7
## Positive: Smgc, Phyh, Rps8, Fxyd3, Cyp2f2, Rpl13, A630073D07Rik, Cldn10, Prlr, Hp
## Eef1a1, Gm44410, Rplp0, Rpl32, Serpinb11
## Negative: Mucl2, Scgb1b27, Prol1, Car6, Scgb2b27, Scgb2b26, Atf3, Fos, Agt, Egr1
## Fosb, Gm26917, Jun, Pigr, Lpo
## PC_ 8
## Positive: Ifrd1, Krt8, Krt18, Anxa1, Cldn4, Bglap3, Actg1, Clu, Krt23, Pip
## Arid5b, Lgals3, Actb, Wfdc12, Ubc
## Negative: Fos, Ly6d, Jun, Ptn, Krt14, Dusp1, Hspa1a, Ucma, Ccn1, Krt5
## Lgals7, Krt17, Dcn, Ier2, Smoc2
## PC_ 9
## Positive: Isg15, Iigp1, Ifit1, Ifitm3, Ly6a, Pecam1, Srgn, Fabp4, Ecscr, Tm4sf1
## Rsad2, Emcn, Rasip1, Cd93, Flt1
## Negative: Dcn, Fos, S100a6, Egr1, Atf3, Nupr1, Krt18, Smoc2, Fosb, Mt1
## Jun, Rbp1, Gsn, Esp18, Klf6
## PC_ 10
## Positive: Klk1b22, Klk1b4, Klk1b5, Klk1b21, Egf, Ngf, Klk1b26, Klk1b3, Gm42418, Gm26917
## Wfdc18, Scgb1c1, Cracr2a, Esp4, Ier5
## Negative: Fxyd2, Cat, Ly6a, Bglap3, Fos, Arg1, Ces1d, Gstm1, Sord, Duoxa2
## Tacstd2, Krt7, Cyp2f2, Duox2, Wfdc2
## PC_ 11
## Positive: Gm26917, Gm42418, Malat1, Lars2, Cracr2a, Ppp1r10, Hspa1b, Neat1, Mafb, Hexim1
## Klk1, Jun, Clec2d, 2900060B14Rik, Hspa1a
## Negative: Atf3, Klk1b4, Pip, Mt1, Agt, Wfdc12, Bglap3, Klk1b5, Irf1, Ifitm3
## Zfp36, Crisp3, Nfkbia, Ier3, Cxcl10
## PC_ 12
## Positive: Hspa1a, Hspb1, Actg1, Hspa1b, Esp18, Actb, Ly6d, Hspe1, Timp3, Ubc
## Nupr1, Krt8, Hspa8, Fxyd2, Hsp90aa1
## Negative: Bglap3, Cxcl10, Gm26917, Irf1, Isg15, Ifi27l2a, Ifitm3, Ifit1, Irf7, Nfkbia
## Gm42418, Hsd11b1, Oasl2, Lars2, Duox2
VizDimLoadings(filtered_seurat_norm_tSNE, dims = 1, reduction = "pca")

VizDimLoadings(filtered_seurat_norm_tSNE, dims = 2, reduction = "pca")

VizDimLoadings(filtered_seurat_norm_tSNE, dims = 3, reduction = "pca")

# Elbow plot: threshold for identifying the majority of the variation. However, this method can be quite subjective.
ElbowPlot(object = filtered_seurat_norm_UMAP,
ndims = 40)

# CLUSTER THE CELLS
# Determine the K-nearest neighbor graph
set.seed (12)
filtered_seurat_norm_UMAP <- FindNeighbors(object = filtered_seurat_norm_UMAP, reduction = "pca", dims = 1:15, verbose = TRUE, graph.name = "mSG")
## Computing nearest neighbor graph
## Computing SNN
## Only one graph name supplied, storing nearest-neighbor graph only
# Determine the clusters with Leiden algorithm
set.seed (12)
filtered_seurat_norm_UMAP_0.5 <- FindClusters(object = filtered_seurat_norm_UMAP, verbose = TRUE, algorithm = 4, resolution = 0.5, graph.name = "mSG")
set.seed(12)
distance_matrix <- dist(Embeddings(filtered_seurat_norm_UMAP_0.5[['umap']])[, 1:2])
#Isolate cluster name
clusters <- filtered_seurat_norm_UMAP_0.5@active.ident
#Compute silhouette score for each cluster
silhouette <- silhouette(as.numeric(clusters), dist = distance_matrix)
filtered_seurat_norm_UMAP_0.5@meta.data$silhouette_score <- silhouette[,3]
#Plot
fviz_silhouette(silhouette, label = FALSE, print.summary = TRUE)
## cluster size ave.sil.width
## 1 1 672 0.75
## 2 2 474 0.18
## 3 3 370 0.66
## 4 4 364 0.81
## 5 5 255 0.40
## 6 6 221 0.53
## 7 7 168 0.61
## 8 8 151 -0.16
## 9 9 131 0.63

# Determine metrics to plot present in seurat_integrated@meta.data
metrics <- c("nUMI", "nGene", "mitoRatio")
FeaturePlot(filtered_seurat_norm_UMAP_0.5,
reduction = "umap",
features = metrics,
pt.size = 0.4,
order = TRUE,
min.cutoff = 'q10',
label = TRUE)

# Basal - Cluster 8
FeaturePlot(filtered_seurat_norm_UMAP_0.5_RNA,
reduction = "umap",
features = c("Krt15", "Krt14"),
order = TRUE,
min.cutoff = 'q10',
label = TRUE,
repel = TRUE)

# ACINAR 1 - Cluster 4
FeaturePlot(filtered_seurat_norm_UMAP_0.5_RNA,
reduction = "umap",
features = c("Pip", # for 1 and 2
"Scgb2b27", "Lpo", "Car6"), # for 1
order = TRUE,
min.cutoff = 'q10',
label = TRUE,
repel = TRUE)

# ACINAR 2
FeaturePlot(filtered_seurat_norm_UMAP_0.5_RNA,
reduction = "umap",
features = c("Pip", # for 1 and 2
"Prlr", "Smgc"), # for 2
order = TRUE,
min.cutoff = 'q10',
label = TRUE,
repel = TRUE)

# Ductal 1
FeaturePlot(filtered_seurat_norm_UMAP_0.5_RNA,
reduction = "umap",
features = c("Klk1",
"Ngf"),
order = TRUE,
min.cutoff = 'q10',
label = TRUE,
repel = TRUE)

# Ductal 2
FeaturePlot(filtered_seurat_norm_UMAP_0.5_RNA,
reduction = "umap",
features = c("Foxi1",
"Ascl3"),
order = TRUE,
min.cutoff = 'q10',
label = TRUE,
repel = TRUE)

# Ductal 3
FeaturePlot(filtered_seurat_norm_UMAP_0.5_RNA,
reduction = "umap",
features = c("Slc9a3", "Clic6"),
order = TRUE,
min.cutoff = 'q10',
label = TRUE,
repel = TRUE)
